Get data

Red tree voles in Humboldt County

voles <- read_sf(here("data", "redtreevoledata"),
                 layer = "ds033") %>% 
  dplyr::select(COUNTY) %>% 
  dplyr::filter(COUNTY == "HUM") %>% 
  st_transform(crs = 4326) #keeping only the lat/long format

#st_crs(voles)

plot(voles)

#read in data for Humboldt county
humboldt <- read_sf(here("data", "redtreevoledata"),
                 layer = "california_county_shape_file", crs = 4326) %>% 
  filter(NAME == "Humboldt") %>% 
  dplyr::select(NAME)

#st_crs(humboldt)

plot(humboldt)

tm_shape(humboldt)+
  tm_fill()+
  tm_shape(voles)+
  tm_dots(size = 0.15)

#Geocomputation in R (Robin Lovelace)

ggplot()+
  geom_sf(data = humboldt)+
  geom_sf(data = voles)

Convert vole events and Humboldt polygon to point pattern and window

#voles_sp <- as(voles, "Spatial") #convert to spatial dataframe
#voles_ppp <- as(voles_sp, "ppp")

Cluster analysis

k-means

iris_nice <- iris %>% 
  clean_names

ggplot(data = iris_nice)+
  geom_point(aes(x = petal_length,
                 y = petal_width, 
                 color = species))

#how many cluster do you think there should be for this dataset?

number_est <- NbClust(iris_nice[1:4],
                      min.nc = 2,
                      max.nc = 10,
                      method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 11 proposed 3 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
#dindex is the number of clusters R finds (highest is 2) - but 3 species

#do kmeans
iris_km <- kmeans(iris_nice[1:4], 3) #must tell it the number of clusters

#bind the cluster number together with the original data
iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))

#plot different clusters
ggplot(data = iris_cl)+
  geom_point(aes(x = sepal_length, y = sepal_width, color = cluster_no))

plot_ly(x = iris_cl$petal_length,
        y = iris_cl$petal_width,
        z = iris_cl$sepal_length,
        type = "scatter3d",
        color = iris_cl$cluster_no)

Hierarchical cluster analysis

#read in data
wb_env <- read_csv(here("data", "wb_env.csv"))

wb_ghg_20 <- wb_env %>% 
  arrange(-ghg) %>% 
  head(20)

wb_scaled <- as.data.frame(scale(wb_ghg_20[3:7]))

rownames(wb_scaled) <- wb_ghg_20$name

#find distances (create a dissimilarity matrix)
diss <- dist(wb_scaled, method = "euclidean", upper = TRUE)

#use eclidean distances to do some complete agglomerative clustering
hc_complete <- hclust(diss, method = "complete")

#plot it
plot(hc_complete, cex = 0.6, hang = -1)

ggdendrogram(hc_complete,
             rotate = TRUE)+
  theme_minimal()+
  labs(x = "Country")

China and US are similar to each other but VERY different from the rest of the countries